Cleaning up environmental salinity data: remove flagged data first, then use a 7 day rolling mean window to remove outliers — data points that are +/- 2 standard deviations away from the rolling mean.

EOS pier/Tiburon

Clear environment, load packages, read in data.

# Start from a clean workspace.
# NOTE(review): rm(list = ls()) only clears the global environment and is
# discouraged in shared scripts — prefer starting a fresh R session.
rm(list=ls())

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
library(ggpubr)
library(scales)
library(chron)
library(plotly)
library(taRifx)
library(aweek)
library(easypackages)
## Warning: package 'easypackages' was built under R version 4.0.3
library(renv)
## Warning: package 'renv' was built under R version 4.0.3
library(here)
## Warning: package 'here' was built under R version 4.0.3
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.0.3
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.0.3
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 4.0.3
## Warning: package 'PerformanceAnalytics' was built under R version 4.0.3
## Warning: package 'quantmod' was built under R version 4.0.3
library(recipes) 
library(cranlogs)
## Warning: package 'cranlogs' was built under R version 4.0.3
library(knitr)

# Read the EOS pier buoy record; fileEncoding = "UTF-8-BOM" strips the
# byte-order mark that Excel prepends to CSV exports.
# NOTE(review): the absolute Windows path is non-portable — here::here()
# is already loaded and could anchor a project-relative path instead.
eos<-read.csv("C:/Users/chels/Box Sync/Thesis/Data/Working data/Bouy data/eos_bouy_r.csv", header = TRUE, sep=",", fileEncoding="UTF-8-BOM", stringsAsFactors = FALSE)

Convert to date column

# Parse m/d/Y date strings into Date objects; unparseable entries become NA.
# `format` takes a single string — the original c() wrapper was a no-op.
eos$date <- as.Date(eos$date, format = "%m/%d/%Y")

Subset by year

# Subset to a single year each, matching the per-year plot subtitles below.
# Fix: the original used eos[- grep("2017", ...), ], which REMOVES the
# matched year (keeping the other two) — and negative indexing on
# grep()'s integer(0) result would drop every row when nothing matched.
# grepl() builds a logical mask that keeps exactly the requested year.
eos17 <- eos[grepl("2017", eos$date), ]
eos18 <- eos[grepl("2018", eos$date), ]
eos19 <- eos[grepl("2019", eos$date), ]

Plot raw data by year to view trends

# Raw EOS pier salinity: full record plus one panel per year, stacked.
# ylim(0, 35) hides the extreme outliers (hence the "Removed rows"
# warnings in the rendered output).
# Fix: plot titles previously misspelled "research" as "resesarch".
all <- eos %>%
  ggplot(aes(x = date, y = salinity, color = salinity)) +
  geom_point(alpha = 0.5) + ylim(0, 35) +
  labs(title = "Salinity data from EOS research pier",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of CeNCOOS")

e17 <- eos17 %>%
  ggplot(aes(x = date, y = salinity, color = salinity)) +
  geom_point(alpha = 0.5) + ylim(0, 35) +
  labs(title = "Salinity data from EOS research pier",
       subtitle = "01/01/2017 - 12/31/2017",
       caption = "data courtesy of CeNCOOS")
e18 <- eos18 %>%
  ggplot(aes(x = date, y = salinity, color = salinity)) +
  geom_point(alpha = 0.5) + ylim(0, 35) +
  labs(title = "Salinity data from EOS research pier",
       subtitle = "01/01/2018 - 12/31/2018",
       caption = "data courtesy of CeNCOOS")
e19 <- eos19 %>%
  ggplot(aes(x = date, y = salinity, color = salinity)) +
  geom_point(alpha = 0.5) + ylim(0, 35) +
  labs(title = "Salinity data from EOS research pier",
       subtitle = "01/01/2019 - 12/31/2019",
       caption = "data courtesy of CeNCOOS")

ggarrange(all, e17, e18, e19, ncol = 1, nrow = 4)
## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing missing values (geom_point).

Remove flagged data –> no QC flag column is available for this data set. Instead, apply a filter to remove salinity values greater than 35; this gets rid of the two largest outliers.

# Drop the implausible salinity readings (> 35) flagged visually above.
eos <- eos %>% filter(salinity < 35)

Make function

# Rolling-window summary used with zoo::rollapply via tq_mutate().
# Returns a named numeric vector: the window mean, its standard deviation,
# and mean +/- 2 SD bounds (labelled hi.95 / lo.95) used to flag outliers.
#
# Fixes: the original ignored its `na.rm` argument (TRUE was hard-coded
# in every call) and returned the result invisibly because the last
# expression was an assignment.
custom_stat_fun_2 <- function(x, na.rm = TRUE) {
  m <- mean(x, na.rm = na.rm)
  s <- sd(x, na.rm = na.rm)
  c(mean = m, stdev = s, hi.95 = m + 2 * s, lo.95 = m - 2 * s)
}

Roll apply using custom stat function. Width will depend on the frequency of the time series. Data are collected every 6 minutes, so width = 1680 is 7 days, 3360 is 14 days, and 7200 is 30 days (240 observations per day).

Rolling 7 day mean:

# 7-day rolling mean/SD envelope: 240 obs per day * 7 days = 1680 rows.
eos_week <- tq_mutate(
  eos,
  select = salinity,
  mutate_fun = rollapply,
  width = 1680,
  align = "right",
  by.column = FALSE,
  FUN = custom_stat_fun_2,
  na.rm = TRUE
)
## Warning in zoo(y, order.by = index(x), ...): some methods for "zoo" objects do
## not work if the index entries in 'order.by' are not unique

Filter and graph. Top=raw, bottom = filtered

# Keep only points inside the rolling mean +/- 2 SD envelope. Rows in the
# first (incomplete) window have NA bounds and are dropped by filter() too.
eos_clean_data <- filter(eos_week, salinity < hi.95 & salinity > lo.95)

# Fix: titles previously misspelled "research" as "resesarch".
eos_graph <- ggplot(data = eos_clean_data,
                    mapping = aes(x = date, y = salinity, color = salinity)) +
  geom_point() +
  labs(title = "Salinity data from EOS research pier REMOVED",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of CeNCOOS")

eos_raw <- eos %>%
  ggplot(aes(x = date, y = salinity, color = salinity)) +
  geom_point(alpha = 0.5) + ylim(0, 35) +
  labs(title = "Salinity data from EOS research pier ALL",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of CeNCOOS")

# Raw on top, filtered below.
ggarrange(eos_raw, eos_graph, ncol = 1, nrow = 2)

Richardson Bay

Clear environment except for what we need “eos_graph” and the function we made “custom_stat_fun_2”. Read in data.

# Drop everything except the saved EOS graph and the rolling-stat helper.
rm(list = setdiff(ls(), c("eos_graph", "custom_stat_fun_2")))

# Read the Richardson Bay record (UTF-8-BOM strips Excel's byte-order mark).
rb <- read.csv("C:/Users/chels/Box Sync/Thesis/Data/Working data/Bouy data/richardson_bay_all.csv",
               header = TRUE, sep = ",", fileEncoding = "UTF-8-BOM",
               stringsAsFactors = FALSE)

Convert to date

# Parse DateTimeStamp into a Date column.
# NOTE(review): this uses a two-digit year ("%m/%d/%y") while the other
# sites use four-digit "%m/%d/%Y" — confirm against the raw file; a
# format mismatch silently yields NA dates.
rb$date<-as.Date(rb$DateTimeStamp, format = c("%m/%d/%y"))

Subset by year

# Subset to a single year each, matching the per-year plot subtitles below.
# Fix: rb[- grep(...), ] removed the matched year instead of keeping it,
# and would drop all rows if grep() matched nothing; grepl() keeps
# exactly the rows whose date contains the year.
rb17 <- rb[grepl("2017", rb$date), ]
rb18 <- rb[grepl("2018", rb$date), ]
rb19 <- rb[grepl("2019", rb$date), ]

Plot raw data

# Raw Richardson Bay salinity: full record plus one panel per year.
rball <- ggplot(rb, aes(x = date, y = Sal, color = Sal)) +
  geom_point(alpha = 0.5) +
  labs(title = "Salinity data from Richardson Bay",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of NERR")

r17 <- ggplot(rb17, aes(x = date, y = Sal, color = Sal)) +
  geom_point(alpha = 0.5) +
  labs(title = "Salinity data from Richardson Bay",
       subtitle = "01/01/2017 - 12/31/2017",
       caption = "data courtesy of NERR")

r18 <- ggplot(rb18, aes(x = date, y = Sal, color = Sal)) +
  geom_point(alpha = 0.5) +
  labs(title = "Salinity data from Richardson Bay",
       subtitle = "01/01/2018 - 12/31/2018",
       caption = "data courtesy of NERR")

r19 <- ggplot(rb19, aes(x = date, y = Sal, color = Sal)) +
  geom_point(alpha = 0.5) +
  labs(title = "Salinity data from Richardson Bay",
       subtitle = "01/01/2019 - 12/31/2019",
       caption = "data courtesy of NERR")

# Stack all four panels vertically.
ggarrange(rball, r17, r18, r19, ncol = 1, nrow = 4)
## Warning: Removed 1496 rows containing missing values (geom_point).
## Warning: Removed 1435 rows containing missing values (geom_point).
## Warning: Removed 1069 rows containing missing values (geom_point).
## Warning: Removed 488 rows containing missing values (geom_point).

Removed flagged data that did not pass QAQC. F_sal with a -3 indicates fail

# Drop rows whose F_Sal flag contains "-3" (failed QAQC).
# Fix: rb[- grep(...), ] returns ZERO rows when grep() finds no match
# (negative indexing on integer(0)); !grepl() keeps all rows in that
# case and is otherwise identical.
rb <- rb[!grepl("-3", rb$F_Sal), ]

Roll apply using custom stat function. Again, the width will depend on the frequency of the time series. Data are collected every 15 minutes, so width = 672 is 7 days, 1344 is 14 days, and 2880 is 30 days (96 observations per day). Since the 7 day window worked for the EOS data, I'm going to do the same here so that it's consistent.

7 day window

# 7-day rolling mean/SD envelope: 96 obs per day * 7 days = 672 rows.
rb_week <- tq_mutate(
  rb,
  select = Sal,
  mutate_fun = rollapply,
  width = 672,
  align = "right",
  by.column = FALSE,
  FUN = custom_stat_fun_2,
  na.rm = TRUE
)
## Warning in zoo(y, order.by = index(x), ...): some methods for "zoo" objects do
## not work if the index entries in 'order.by' are not unique

Filter and graph. Top = raw, bottom = filtered

# Keep only points inside the rolling mean +/- 2 SD envelope.
rb_clean_data <- rb_week %>%
  filter(Sal < hi.95, Sal > lo.95)

rb_graph <- rb_clean_data %>%
  ggplot(aes(x = date, y = Sal, color = Sal)) +
  geom_point() +
  labs(title = "Salinity data from Richardson Bay REMOVED",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of NERR")

rb_all <- rb %>%
  ggplot(aes(x = date, y = Sal, color = Sal)) +
  geom_point() +
  labs(title = "Salinity data from Richardson Bay RAW",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of NERR")

# Raw on top, filtered below.
ggarrange(rb_all, rb_graph, ncol = 1, nrow = 2)
## Warning: Removed 1496 rows containing missing values (geom_point).

China Camp

Clear environment. We need to keep “eos_graph”, “rb_graph”, and “custom_stat_fun_2”. Read in data.

# Keep the two saved site graphs plus the rolling-stat helper.
rm(list = setdiff(ls(), c("eos_graph", "custom_stat_fun_2", "rb_graph")))

# Read the China Camp record (UTF-8-BOM strips Excel's byte-order mark).
cc <- read.csv("C:/Users/chels/Box Sync/Thesis/Data/Working data/Bouy data/china_camp_all.csv",
               header = TRUE, sep = ",", fileEncoding = "UTF-8-BOM",
               stringsAsFactors = FALSE)

Convert to date column

# Parse DateTimeStamp (m/d/Y) into a Date column; unparseable entries
# become NA. `format` takes a single string — the c() wrapper was a no-op.
cc$date <- as.Date(cc$DateTimeStamp, format = "%m/%d/%Y")

Subset by year

# Subset to a single year each, matching the per-year plot subtitles below.
# Fix: cc[- grep(...), ] removed the matched year instead of keeping it,
# and would drop all rows if grep() matched nothing; grepl() keeps
# exactly the rows whose date contains the year.
cc17 <- cc[grepl("2017", cc$date), ]
cc18 <- cc[grepl("2018", cc$date), ]
cc19 <- cc[grepl("2019", cc$date), ]

Plot raw data

# Raw China Camp salinity: full record plus one panel per year.
ccall <- ggplot(cc, aes(x = date, y = Sal, color = Sal)) +
  geom_point(alpha = 0.5) +
  labs(title = "Salinity data from China Camp",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of NERR")

c17 <- ggplot(cc17, aes(x = date, y = Sal, color = Sal)) +
  geom_point(alpha = 0.5) +
  labs(title = "Salinity data from China Camp",
       subtitle = "01/01/2017 - 12/31/2017",
       caption = "data courtesy of NERR")

c18 <- ggplot(cc18, aes(x = date, y = Sal, color = Sal)) +
  geom_point(alpha = 0.5) +
  labs(title = "Salinity data from China Camp",
       subtitle = "01/01/2018 - 12/31/2018",
       caption = "data courtesy of NERR")

c19 <- ggplot(cc19, aes(x = date, y = Sal, color = Sal)) +
  geom_point(alpha = 0.5) +
  labs(title = "Salinity data from China Camp",
       subtitle = "01/01/2019 - 12/31/2019",
       caption = "data courtesy of NERR")

# Stack all four panels vertically.
ggarrange(ccall, c17, c18, c19, ncol = 1, nrow = 4)
## Warning: Removed 835 rows containing missing values (geom_point).
## Warning: Removed 315 rows containing missing values (geom_point).
## Warning: Removed 523 rows containing missing values (geom_point).
## Warning: Removed 832 rows containing missing values (geom_point).

Remove flagged data

# Drop rows whose F_Sal flag contains "-3" (failed QAQC).
# Fix: cc[- grep(...), ] returns ZERO rows when grep() finds no match
# (negative indexing on integer(0)); !grepl() keeps all rows in that
# case and is otherwise identical.
cc <- cc[!grepl("-3", cc$F_Sal), ]

Roll apply using custom stat function. Again, the width will depend on the frequency of the time series. Data are collected every 15 minutes, so width = 672 is 7 days, 1344 is 14 days, and 2880 is 30 days (96 observations per day). Since the 7 day window worked for the EOS data, I'm going to do the same here so that it's consistent.

7 day window

# 7-day rolling mean/SD envelope: 96 obs per day * 7 days = 672 rows.
cc_week <- tq_mutate(
  cc,
  select = Sal,
  mutate_fun = rollapply,
  width = 672,
  align = "right",
  by.column = FALSE,
  FUN = custom_stat_fun_2,
  na.rm = TRUE
)
## Warning in zoo(y, order.by = index(x), ...): some methods for "zoo" objects do
## not work if the index entries in 'order.by' are not unique

Filter and graph. Top=raw, bottom = filtered

# Keep only points inside the rolling mean +/- 2 SD envelope.
cc_clean_data <- cc_week %>%
  filter(Sal < hi.95, Sal > lo.95)

cc_graph <- cc_clean_data %>%
  ggplot(aes(x = date, y = Sal, color = Sal)) +
  geom_point() +
  labs(title = "Salinity data from China Camp REMOVED",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of NERR")

cc_raw <- cc %>%
  ggplot(aes(x = date, y = Sal, color = Sal)) +
  geom_point() +
  labs(title = "Salinity data from China Camp RAW",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of NERR")

# Raw on top, filtered below.
ggarrange(cc_raw, cc_graph, ncol = 1, nrow = 2)
## Warning: Removed 835 rows containing missing values (geom_point).

Fort Point

Clear environment. We need to keep “eos_graph”, “rb_graph”, “cc_graph” and “custom_stat_fun_2”. Read in data.

# Keep the three saved site graphs plus the rolling-stat helper.
rm(list = setdiff(ls(), c("eos_graph", "custom_stat_fun_2", "rb_graph", "cc_graph")))

# Read the Fort Point record (UTF-8-BOM strips Excel's byte-order mark).
fp <- read.csv("C:/Users/chels/Box Sync/Thesis/Data/Working data/Bouy data/fort_point_r.csv",
               header = TRUE, sep = ",", fileEncoding = "UTF-8-BOM",
               stringsAsFactors = FALSE)

Convert to date column

# Parse ISO-style dates; "%Y-%m-%d" is also as.Date()'s default, kept
# explicit for clarity. `format` takes a single string — the c() wrapper
# was a no-op.
fp$date <- as.Date(fp$date, format = "%Y-%m-%d")

Subset by year

# Subset to a single year each, matching the per-year plot subtitles below.
# Fix: fp[- grep(...), ] removed the matched year instead of keeping it,
# and would drop all rows if grep() matched nothing; grepl() keeps
# exactly the rows whose date contains the year.
fp17 <- fp[grepl("2017", fp$date), ]
fp18 <- fp[grepl("2018", fp$date), ]
fp19 <- fp[grepl("2019", fp$date), ]

Plot raw data

# Raw Fort Point salinity: full record plus one panel per year.
fpall <- ggplot(fp, aes(x = date, y = sea_water_practical_salinity,
                        color = sea_water_practical_salinity)) +
  geom_point(alpha = 0.5) +
  labs(title = "Salinity data from Fort Point",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of CeNCOOS")

f17 <- ggplot(fp17, aes(x = date, y = sea_water_practical_salinity,
                        color = sea_water_practical_salinity)) +
  geom_point(alpha = 0.5) +
  labs(title = "Salinity data from Fort Point",
       subtitle = "01/01/2017 - 12/31/2017",
       caption = "data courtesy of CeNCOOS")

f18 <- ggplot(fp18, aes(x = date, y = sea_water_practical_salinity,
                        color = sea_water_practical_salinity)) +
  geom_point(alpha = 0.5) +
  labs(title = "Salinity data from Fort Point",
       subtitle = "01/01/2018 - 12/31/2018",
       caption = "data courtesy of CeNCOOS")

f19 <- ggplot(fp19, aes(x = date, y = sea_water_practical_salinity,
                        color = sea_water_practical_salinity)) +
  geom_point(alpha = 0.5) +
  labs(title = "Salinity data from Fort Point",
       subtitle = "01/01/2019 - 12/31/2019",
       caption = "data courtesy of CeNCOOS")

# Stack all four panels vertically.
ggarrange(fpall, f17, f18, f19, ncol = 1, nrow = 4)

Remove flagged data: sea_water_practical_salinity_qc_agg = 4 indicates FAIL.

# Drop rows whose aggregate QC flag equals 4 (FAIL).
# NOTE(review): the narrative above names sea_water_practical_salinity_qc_tests,
# but this filters on ..._qc_agg — confirm the intended column. Also note
# filter() silently drops rows where the flag is NA.
fp<-filter(fp, sea_water_practical_salinity_qc_agg!=4)

Roll apply using custom stat function. Data collected every 6 minutes so width=1680 is 7 days, 3360 is 14 days, 7200 is 30 days. 240 observations per day.

7 day window

# 7-day rolling mean/SD envelope: 240 obs per day * 7 days = 1680 rows.
fpweek <- tq_mutate(
  fp,
  select = sea_water_practical_salinity,
  mutate_fun = rollapply,
  width = 1680,
  align = "right",
  by.column = FALSE,
  FUN = custom_stat_fun_2,
  na.rm = TRUE
)
## Warning in zoo(y, order.by = index(x), ...): some methods for "zoo" objects do
## not work if the index entries in 'order.by' are not unique

Filter and graph. Raw=top, filtered=bottom

# Keep only points inside the rolling mean +/- 2 SD envelope.
fp_clean_data <- fpweek %>%
  filter(sea_water_practical_salinity < hi.95,
         sea_water_practical_salinity > lo.95)

fp_graph <- fp_clean_data %>%
  ggplot(aes(x = date, y = sea_water_practical_salinity,
             color = sea_water_practical_salinity)) +
  geom_point() +
  labs(title = "Salinity data from Fort Point REMOVED",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of CeNCOOS")

fp_raw <- fp %>%
  ggplot(aes(x = date, y = sea_water_practical_salinity,
             color = sea_water_practical_salinity)) +
  geom_point() +
  labs(title = "Salinity data from Fort Point RAW",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of CeNCOOS")

# Raw on top, filtered below.
ggarrange(fp_raw, fp_graph, ncol = 1, nrow = 2)

All together. Now that I have all my graphs, I'm going to stack them. This is the salinity data from 01/01/2017-12/31/2019. I first removed data points that failed QC tests. I was able to do this for all sites except for EOS pier, where I didn't see that column available for download. Instead I just filtered to remove salinity points greater than 35, which looks like it worked. Next step is to get hourly averages.

# Stack the cleaned salinity panels for all four sites.
ggarrange(cc_graph, eos_graph, rb_graph, fp_graph, ncol = 1, nrow = 4)